import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#---#
from autogluon.tabular import TabularPredictor
import autogluon.eda.auto as auto
#---#
import warnings
warnings.filterwarnings('ignore')
X = [1.0,3000,2,1]
y = [0,1,0,0]
XX = [2.0, 2800]
yy = [0,1]
df_tr = pd.DataFrame([X,y]).transpose().rename(columns={0:'X', 1:'y'})
df_tr
|   | X | y |
|---|---|---|
| 0 | 1.0 | 0.0 |
| 1 | 3000.0 | 1.0 |
| 2 | 2.0 | 0.0 |
| 3 | 1.0 | 0.0 |
df_tst = pd.DataFrame([XX,yy]).transpose().rename(columns={0:'X', 1:'y'})
df_tst
|   | X | y |
|---|---|---|
| 0 | 2.0 | 0.0 |
| 1 | 2800.0 | 1.0 |
predictor = TabularPredictor(label='y')
No path specified. Models will be saved in: "AutogluonModels/ag-20240201_083938/"
path = predictor.fit(df_tr)
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240201_083938/"
AutoGluon Version: 0.8.2
Python Version: 3.8.18
Operating System: Linux
Platform Machine: x86_64
Platform Version: #38~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Nov 2 18:01:13 UTC 2
Disk Space Avail: 598.15 GB / 982.82 GB (60.9%)
Train Data Rows: 4
Train Data Columns: 1
Label Column: y
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
2 unique label values: [0.0, 1.0]
If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Warning: Updated label_count_threshold from 10 to 1 to avoid cutting too many classes.
Warning: Updated holdout_frac from 0.2 to 0.251 to avoid cutting too many classes.
Selected class <--> label mapping: class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
Available Memory: 50963.24 MB
Train Data (Original) Memory Usage: 0.0 MB (0.0% of available memory)
Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
Stage 1 Generators:
Fitting AsTypeFeatureGenerator...
Stage 2 Generators:
Fitting FillNaFeatureGenerator...
Stage 3 Generators:
Fitting IdentityFeatureGenerator...
Stage 4 Generators:
Fitting DropUniqueFeatureGenerator...
Stage 5 Generators:
Fitting DropDuplicatesFeatureGenerator...
Types of features in original data (raw dtype, special dtypes):
('float', []) : 1 | ['X']
Types of features in processed data (raw dtype, special dtypes):
('float', []) : 1 | ['X']
0.0s = Fit runtime
1 features in original data used to generate 1 features in processed data.
Train Data (Processed) Memory Usage: 0.0 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.04s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.251, Train Rows: 3, Val Rows: 1
User-specified model hyperparameters to be fit:
{
'NN_TORCH': {},
'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
'CAT': {},
'XGB': {},
'FASTAI': {},
'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
}
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif ...
Warning: Exception caused KNeighborsUnif to fail during training... Skipping this model.
Expected n_neighbors <= n_samples, but n_samples = 3, n_neighbors = 5
Detailed Traceback:
Traceback (most recent call last):
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/autogluon/core/trainer/abstract_trainer.py", line 1755, in _train_and_save
y_pred_proba_val = model.predict_proba(X_val)
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/autogluon/core/models/abstract/abstract_model.py", line 931, in predict_proba
y_pred_proba = self._predict_proba(X=X, **kwargs)
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/autogluon/core/models/abstract/abstract_model.py", line 949, in _predict_proba
y_pred_proba = self.model.predict_proba(X)
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/sklearn/neighbors/_classification.py", line 283, in predict_proba
neigh_ind = self.kneighbors(X, return_distance=False)
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/sklearn/neighbors/_base.py", line 810, in kneighbors
raise ValueError(
ValueError: Expected n_neighbors <= n_samples, but n_samples = 3, n_neighbors = 5
Fitting model: KNeighborsDist ...
Warning: Exception caused KNeighborsDist to fail during training... Skipping this model.
Expected n_neighbors <= n_samples, but n_samples = 3, n_neighbors = 5
Detailed Traceback:
Traceback (most recent call last):
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/autogluon/core/trainer/abstract_trainer.py", line 1755, in _train_and_save
y_pred_proba_val = model.predict_proba(X_val)
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/autogluon/core/models/abstract/abstract_model.py", line 931, in predict_proba
y_pred_proba = self._predict_proba(X=X, **kwargs)
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/autogluon/core/models/abstract/abstract_model.py", line 949, in _predict_proba
y_pred_proba = self.model.predict_proba(X)
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/sklearn/neighbors/_classification.py", line 286, in predict_proba
neigh_dist, neigh_ind = self.kneighbors(X)
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/sklearn/neighbors/_base.py", line 810, in kneighbors
raise ValueError(
ValueError: Expected n_neighbors <= n_samples, but n_samples = 3, n_neighbors = 5
Fitting model: LightGBMXT ...
1.0 = Validation score (accuracy)
0.29s = Training runtime
0.0s = Validation runtime
Fitting model: LightGBM ...
1.0 = Validation score (accuracy)
0.28s = Training runtime
0.0s = Validation runtime
Fitting model: RandomForestGini ...
1.0 = Validation score (accuracy)
0.25s = Training runtime
0.02s = Validation runtime
Fitting model: RandomForestEntr ...
1.0 = Validation score (accuracy)
0.24s = Training runtime
0.02s = Validation runtime
Fitting model: CatBoost ...
1.0 = Validation score (accuracy)
0.09s = Training runtime
0.0s = Validation runtime
Fitting model: ExtraTreesGini ...
1.0 = Validation score (accuracy)
0.24s = Training runtime
0.02s = Validation runtime
Fitting model: ExtraTreesEntr ...
1.0 = Validation score (accuracy)
0.24s = Training runtime
0.02s = Validation runtime
Fitting model: NeuralNetFastAI ...
No improvement since epoch 0: early stopping
0.0 = Validation score (accuracy)
0.08s = Training runtime
0.0s = Validation runtime
Fitting model: XGBoost ...
1.0 = Validation score (accuracy)
0.04s = Training runtime
0.0s = Validation runtime
Fitting model: NeuralNetTorch ...
Warning: Exception caused NeuralNetTorch to fail during training... Skipping this model.
float division by zero
Detailed Traceback:
Traceback (most recent call last):
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/autogluon/core/trainer/abstract_trainer.py", line 1733, in _train_and_save
model = self._train_single(X, y, model, X_val, y_val, total_resources=total_resources, **model_fit_kwargs)
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/autogluon/core/trainer/abstract_trainer.py", line 1684, in _train_single
model = model.fit(X=X, y=y, X_val=X_val, y_val=y_val, total_resources=total_resources, **model_fit_kwargs)
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/autogluon/core/models/abstract/abstract_model.py", line 829, in fit
out = self._fit(**kwargs)
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/autogluon/tabular/models/tabular_nn/torch/tabular_nn_torch.py", line 207, in _fit
self._train_net(
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/autogluon/tabular/models/tabular_nn/torch/tabular_nn_torch.py", line 365, in _train_net
f"Epoch {epoch} (Update {total_updates}).\t"
ZeroDivisionError: float division by zero
Fitting model: LightGBMLarge ...
1.0 = Validation score (accuracy)
0.28s = Training runtime
0.0s = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
1.0 = Validation score (accuracy)
0.24s = Training runtime
0.0s = Validation runtime
AutoGluon training complete, total runtime = 2.5s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20240201_083938/")
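The two KNN failures above are expected: after the holdout split only 3 training rows remain, so sklearn's default n_neighbors=5 cannot be satisfied and AutoGluon simply skips those models. A minimal sketch of how one might work around this via the hyperparameters argument of fit() (and pin the problem type at init, as the log suggests); treat the exact hyperparameter keys as assumptions against AutoGluon 0.8:

predictor = TabularPredictor(label='y', problem_type='binary', eval_metric='accuracy')
predictor.fit(df_tr, hyperparameters={
    'KNN': {'n_neighbors': 1},   # small enough for a 3-row training split
    'GBM': {},                   # keep one boosted-tree model for comparison
})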
predictor.evaluate(df_tst, silent=True)
{'accuracy': 1.0,
'balanced_accuracy': 1.0,
'mcc': 1.0,
'roc_auc': 1.0,
'f1': 1.0,
'precision': 1.0,
'recall': 1.0}
predictor.predict(df_tst)
0 0.0
1 1.0
Name: y, dtype: float64
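Per-row class probabilities are available as well; predict_proba returns one column per class:

predictor.predict_proba(df_tst)   # columns 0.0 and 1.0, one probability per test row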
predictor.leaderboard(silent=True)
|   | model | score_val | pred_time_val | fit_time | pred_time_val_marginal | fit_time_marginal | stack_level | can_infer | fit_order |
|---|---|---|---|---|---|---|---|---|---|
| 0 | LightGBMLarge | 1.0 | 0.000750 | 0.263480 | 0.000750 | 0.263480 | 1 | True | 10 |
| 1 | LightGBM | 1.0 | 0.000790 | 0.269578 | 0.000790 | 0.269578 | 1 | True | 2 |
| 2 | CatBoost | 1.0 | 0.000808 | 0.087998 | 0.000808 | 0.087998 | 1 | True | 5 |
| 3 | LightGBMXT | 1.0 | 0.000830 | 0.266466 | 0.000830 | 0.266466 | 1 | True | 1 |
| 4 | XGBoost | 1.0 | 0.001658 | 0.041625 | 0.001658 | 0.041625 | 1 | True | 9 |
| 5 | RandomForestGini | 1.0 | 0.017526 | 0.241953 | 0.017526 | 0.241953 | 1 | True | 3 |
| 6 | ExtraTreesEntr | 1.0 | 0.017991 | 0.240180 | 0.017991 | 0.240180 | 1 | True | 7 |
| 7 | ExtraTreesGini | 1.0 | 0.018317 | 0.243194 | 0.018317 | 0.243194 | 1 | True | 6 |
| 8 | RandomForestEntr | 1.0 | 0.018408 | 0.241831 | 0.018408 | 0.241831 | 1 | True | 4 |
| 9 | WeightedEnsemble_L2 | 1.0 | 0.018666 | 0.497928 | 0.000349 | 0.254734 | 2 | True | 11 |
| 10 | NeuralNetFastAI | 0.0 | 0.003675 | 0.084893 | 0.003675 | 0.084893 | 1 | True | 8 |
For each of the models AutoGluon fits, we need to write down the model name one by one and, using the yy values organized for that model …………………….., summarize the results in DataFrame form! A sketch follows below.
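A minimal sketch of that idea, assuming AutoGluon 0.8's get_model_names() and the model= argument of predict(); the choice of result columns here is just an illustration:

rows = []
for name in predictor.get_model_names():
    yyhat = predictor.predict(df_tst, model=name)     # predictions from this one model
    rows.append({'model': name,
                 'acc': (yyhat.values == df_tst['y'].values).mean()})
pd.DataFrame(rows)                                    # one row of results per fitted model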
path.path
'AutogluonModels/ag-20240201_082455/'
path.fit_summary()
*** Summary of fit() ***
Estimated performance of each model:
model score_val pred_time_val fit_time pred_time_val_marginal fit_time_marginal stack_level can_infer fit_order
0 LightGBMXT 1.0 0.000718 0.285295 0.000718 0.285295 1 True 1
1 LightGBM 1.0 0.000762 0.281390 0.000762 0.281390 1 True 2
2 LightGBMLarge 1.0 0.000781 0.275127 0.000781 0.275127 1 True 10
3 CatBoost 1.0 0.000834 0.086406 0.000834 0.086406 1 True 5
4 XGBoost 1.0 0.001617 0.040825 0.001617 0.040825 1 True 9
5 RandomForestEntr 1.0 0.017598 0.241371 0.017598 0.241371 1 True 4
6 ExtraTreesEntr 1.0 0.018220 0.240957 0.018220 0.240957 1 True 7
7 ExtraTreesGini 1.0 0.018498 0.236784 0.018498 0.236784 1 True 6
8 WeightedEnsemble_L2 1.0 0.018856 0.488569 0.000358 0.251786 2 True 11
9 RandomForestGini 1.0 0.019265 0.241472 0.019265 0.241472 1 True 3
10 NeuralNetFastAI 0.0 0.003558 0.081848 0.003558 0.081848 1 True 8
Number of models trained: 11
Types of models trained:
{'RFModel', 'CatBoostModel', 'XTModel', 'NNFastAiTabularModel', 'WeightedEnsembleModel', 'LGBModel', 'XGBoostModel'}
Bagging used: False
Multi-layer stack-ensembling used: False
Feature Metadata (Processed):
(raw dtype, special dtypes):
('float', []) : 1 | ['X']
*** End of fit() summary ***
{'model_types': {'LightGBMXT': 'LGBModel',
'LightGBM': 'LGBModel',
'RandomForestGini': 'RFModel',
'RandomForestEntr': 'RFModel',
'CatBoost': 'CatBoostModel',
'ExtraTreesGini': 'XTModel',
'ExtraTreesEntr': 'XTModel',
'NeuralNetFastAI': 'NNFastAiTabularModel',
'XGBoost': 'XGBoostModel',
'LightGBMLarge': 'LGBModel',
'WeightedEnsemble_L2': 'WeightedEnsembleModel'},
'model_performance': {'LightGBMXT': 1.0,
'LightGBM': 1.0,
'RandomForestGini': 1.0,
'RandomForestEntr': 1.0,
'CatBoost': 1.0,
'ExtraTreesGini': 1.0,
'ExtraTreesEntr': 1.0,
'NeuralNetFastAI': 0.0,
'XGBoost': 1.0,
'LightGBMLarge': 1.0,
'WeightedEnsemble_L2': 1.0},
'model_best': 'WeightedEnsemble_L2',
'model_paths': {'LightGBMXT': 'AutogluonModels/ag-20240201_082455/models/LightGBMXT/',
'LightGBM': 'AutogluonModels/ag-20240201_082455/models/LightGBM/',
'RandomForestGini': 'AutogluonModels/ag-20240201_082455/models/RandomForestGini/',
'RandomForestEntr': 'AutogluonModels/ag-20240201_082455/models/RandomForestEntr/',
'CatBoost': 'AutogluonModels/ag-20240201_082455/models/CatBoost/',
'ExtraTreesGini': 'AutogluonModels/ag-20240201_082455/models/ExtraTreesGini/',
'ExtraTreesEntr': 'AutogluonModels/ag-20240201_082455/models/ExtraTreesEntr/',
'NeuralNetFastAI': 'AutogluonModels/ag-20240201_082455/models/NeuralNetFastAI/',
'XGBoost': 'AutogluonModels/ag-20240201_082455/models/XGBoost/',
'LightGBMLarge': 'AutogluonModels/ag-20240201_082455/models/LightGBMLarge/',
'WeightedEnsemble_L2': 'AutogluonModels/ag-20240201_082455/models/WeightedEnsemble_L2/'},
'model_fit_times': {'LightGBMXT': 0.2852945327758789,
'LightGBM': 0.2813901901245117,
'RandomForestGini': 0.2414722442626953,
'RandomForestEntr': 0.24137091636657715,
'CatBoost': 0.08640623092651367,
'ExtraTreesGini': 0.23678374290466309,
'ExtraTreesEntr': 0.24095678329467773,
'NeuralNetFastAI': 0.08184814453125,
'XGBoost': 0.04082489013671875,
'LightGBMLarge': 0.2751274108886719,
'WeightedEnsemble_L2': 0.2517857551574707},
'model_pred_times': {'LightGBMXT': 0.0007178783416748047,
'LightGBM': 0.0007622241973876953,
'RandomForestGini': 0.019264698028564453,
'RandomForestEntr': 0.01759791374206543,
'CatBoost': 0.0008337497711181641,
'ExtraTreesGini': 0.01849818229675293,
'ExtraTreesEntr': 0.018220186233520508,
'NeuralNetFastAI': 0.0035576820373535156,
'XGBoost': 0.0016171932220458984,
'LightGBMLarge': 0.0007805824279785156,
'WeightedEnsemble_L2': 0.0003581047058105469},
'num_bag_folds': 0,
'max_stack_level': 2,
'num_classes': 2,
'model_hyperparams': {'LightGBMXT': {'learning_rate': 0.05,
'extra_trees': True},
'LightGBM': {'learning_rate': 0.05},
'RandomForestGini': {'n_estimators': 300,
'max_leaf_nodes': 15000,
'n_jobs': -1,
'random_state': 0,
'bootstrap': True,
'criterion': 'gini'},
'RandomForestEntr': {'n_estimators': 300,
'max_leaf_nodes': 15000,
'n_jobs': -1,
'random_state': 0,
'bootstrap': True,
'criterion': 'entropy'},
'CatBoost': {'iterations': 10000,
'learning_rate': 0.05,
'random_seed': 0,
'allow_writing_files': False,
'eval_metric': 'Accuracy'},
'ExtraTreesGini': {'n_estimators': 300,
'max_leaf_nodes': 15000,
'n_jobs': -1,
'random_state': 0,
'bootstrap': True,
'criterion': 'gini'},
'ExtraTreesEntr': {'n_estimators': 300,
'max_leaf_nodes': 15000,
'n_jobs': -1,
'random_state': 0,
'bootstrap': True,
'criterion': 'entropy'},
'NeuralNetFastAI': {'layers': None,
'emb_drop': 0.1,
'ps': 0.1,
'bs': 'auto',
'lr': 0.01,
'epochs': 'auto',
'early.stopping.min_delta': 0.0001,
'early.stopping.patience': 20,
'smoothing': 0.0},
'XGBoost': {'n_estimators': 10000,
'learning_rate': 0.1,
'n_jobs': -1,
'proc.max_category_levels': 100,
'objective': 'binary:logistic',
'booster': 'gbtree'},
'LightGBMLarge': {'learning_rate': 0.03,
'num_leaves': 128,
'feature_fraction': 0.9,
'min_data_in_leaf': 5},
'WeightedEnsemble_L2': {'use_orig_features': False,
'max_base_models': 25,
'max_base_models_per_type': 5,
'save_bag_folds': True}},
'leaderboard': model score_val pred_time_val fit_time \
0 LightGBMXT 1.0 0.000718 0.285295
1 LightGBM 1.0 0.000762 0.281390
2 LightGBMLarge 1.0 0.000781 0.275127
3 CatBoost 1.0 0.000834 0.086406
4 XGBoost 1.0 0.001617 0.040825
5 RandomForestEntr 1.0 0.017598 0.241371
6 ExtraTreesEntr 1.0 0.018220 0.240957
7 ExtraTreesGini 1.0 0.018498 0.236784
8 WeightedEnsemble_L2 1.0 0.018856 0.488569
9 RandomForestGini 1.0 0.019265 0.241472
10 NeuralNetFastAI 0.0 0.003558 0.081848
pred_time_val_marginal fit_time_marginal stack_level can_infer \
0 0.000718 0.285295 1 True
1 0.000762 0.281390 1 True
2 0.000781 0.275127 1 True
3 0.000834 0.086406 1 True
4 0.001617 0.040825 1 True
5 0.017598 0.241371 1 True
6 0.018220 0.240957 1 True
7 0.018498 0.236784 1 True
8 0.000358 0.251786 2 True
9 0.019265 0.241472 1 True
10 0.003558 0.081848 1 True
fit_order
0 1
1 2
2 10
3 5
4 9
5 4
6 7
7 6
8 11
9 3
10 8 }
What's needed for autogluon_amt
import time
import datetime

def autogluon_amt(fraudTrain):
    fraudTrain = fraudTrain[['amt','is_fraud']]
    model = []
    time_diff = []
    acc = []
    pre = []
    rec = []
    f1 = []
    auc = []
    graph_based = []
    pyod = []
    train_size = []
    train_cols = []
    train_frate = []
    test_size = []
    test_frate = []
    hyper_params = []
    # NOTE: predictors, evaluate, X, y, XX, yy, and throw_rate are assumed to be
    # defined elsewhere in this draft (e.g. built from fraudTrain before the loop).
    for name, predictor in predictors.items():
        t1 = time.time()
        predictor.fit(X, y)
        t2 = time.time()
        yyhat = predictor.predict(XX)
        scores = evaluate(yy, yyhat)
        model.append(name)
        time_diff.append(t2 - t1)
        acc.append(scores['acc'])
        pre.append(scores['pre'])
        rec.append(scores['rec'])
        f1.append(scores['f1'])
        auc.append(scores['auc'])
        graph_based.append(False)
        pyod.append(True)
        train_size.append(len(y))
        train_cols.append(list(X.columns))
        train_frate.append(np.array(y).reshape(-1).mean())
        test_size.append(len(yy))
        test_frate.append(np.array(yy).reshape(-1).mean())
        hyper_params.append(None)
    df_results = pd.DataFrame(dict(
        model = model,
        time = time_diff,
        acc = acc,
        pre = pre,
        rec = rec,
        f1 = f1,
        auc = auc,
        graph_based = graph_based,
        pyod = pyod,
        throw_rate = throw_rate,
        train_size = train_size,
        train_cols = train_cols,
        train_frate = train_frate,
        test_size = test_size,
        test_frate = test_frate,
        hyper_params = hyper_params
    ))
    ymdhms = datetime.datetime.fromtimestamp(time.time()).strftime('%Y%m%d-%H%M%S')
    df_results.to_csv(f'./results/{ymdhms}-pyod.csv', index=False)
    return df_results
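The evaluate(yy, yyhat) helper used in that loop is not defined in this draft; a minimal sketch with sklearn, assuming it should return the five score keys the loop reads:

from sklearn import metrics

def evaluate(y, yhat):
    y, yhat = np.array(y), np.array(yhat)
    return {'acc': metrics.accuracy_score(y, yhat),
            'pre': metrics.precision_score(y, yhat),
            'rec': metrics.recall_score(y, yhat),
            'f1':  metrics.f1_score(y, yhat),
            'auc': metrics.roc_auc_score(y, yhat)}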
def throw(df, fraud_rate, random_state=42):  # downsamples non-fraud rows to hit the target fraud rate!
    df1 = df[df['is_fraud'] == 1].copy()
    df0 = df[df['is_fraud'] == 0].copy()
    df0_downsample = (len(df1) * (1-fraud_rate)) / (len(df0) * fraud_rate)
    df0_down = df0.sample(frac=df0_downsample, random_state=random_state)
    df_p = pd.concat([df1, df0_down])
    return df_p
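Why that frac hits the target: with n1 fraud rows and n0 normal rows, keeping a fraction f of df0 leaves a fraud rate of n1 / (n1 + f*n0); solving n1 / (n1 + f*n0) = fraud_rate for f gives exactly df0_downsample = n1*(1 - fraud_rate) / (n0*fraud_rate). A quick check on made-up counts:

n1, n0, r = 100, 9900, 0.3      # hypothetical fraud/normal counts and target rate
f = (n1 * (1 - r)) / (n0 * r)
print(n1 / (n1 + f * n0))       # 0.3, as intended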
import pickle
with open('fraudTrain.pkl', 'rb') as file:
    fraudTrain = pickle.load(file)
test = throw(fraudTrain, 0.3)
test.is_fraud.mean()
0.3
fraudTrain.is_fraud.mean()
0.005727773406766326